数据导入¶
In [96]:
import pandas as pd
In [97]:
df = pd.read_csv("heart.csv")
数据分析¶
In [98]:
# Preview the first five rows
df.head()
Out[98]:
In [99]:
df.shape
Out[99]:
In [100]:
# Preview the last five rows
df.tail()
Out[100]:
In [101]:
# List the column names
df.columns
Out[101]:
In [102]:
# Summary statistics of the columns
df.describe()
Out[102]:
In [103]:
# Show dataset information
df.info()
In [104]:
# Count the missing values in each column
df.isnull().sum()
Out[104]:
一行代码产生数据探索性EDA报告¶
In [105]:
import pandas_profiling
In [106]:
profile = pandas_profiling.ProfileReport(df)
In [107]:
# Display the generated dataset report
profile
Out[107]:
In [108]:
# Save the report to a local HTML file
profile.to_file('profile.html')
数据集可视化分析¶
In [109]:
import matplotlib.pyplot as plt
import seaborn as sns
In [110]:
# Pairwise correlation between the features
df.corr()
Out[110]:
In [111]:
# Visualise the correlation matrix as a heat map
plt.figure(figsize=(10,10), dpi=400)
sns.heatmap(df.corr(), annot=True, fmt='.1f', square=True) # annot: whether to print the number in each cell
plt.show()
In [112]:
# 查看api
sns.heatmap?
In [ ]:
# Draw pairwise scatter plots of the columns
sns.pairplot(df)
plt.show()
In [ ]:
# Distribution of a single feature (age)
# NOTE(review): sns.distplot is deprecated in newer seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
sns.distplot(df['age'])
plt.show()
In [ ]:
df["age"].max()
In [ ]:
df.age.max()
In [ ]:
# Distinct values that occur in the age column
df.age.unique()
In [ ]:
# Number of occurrences of each target value
df.target.value_counts()
In [26]:
# Count plot for a single column (target)
sns.countplot(x="target", data=df, palette="bwr")
plt.show()
In [27]:
sns.countplot(x="sex", data=df, palette="mako_r")
plt.xlabel("Sex(0 =female, 1=male)")
plt.show()
In [28]:
# Relationship between a single feature (age) and the label:
# cross-tabulate age against target and draw the counts as grouped bars.
pd.crosstab(df.age, df.target).plot(kind="bar", figsize=(20, 6))
plt.title("Heart Disease Frequency for Ages")  # fixed typo: was "Hear Disease"
plt.xlabel("Age")
plt.ylabel("Frequency")
# Save before plt.show(); the figure is written to the working directory.
plt.savefig("heartDiseaseAndAges.png")
plt.show()
In [29]:
# Box plot of age for each target value
sns.boxplot(x=df.target,y=df.age)
plt.show()
In [30]:
# Violin plot of age for each target value
sns.violinplot(x=df.target, y=df.age)
plt.show()
In [31]:
# Scatter plot of age against maximum heart rate (thalach), split by disease status
# target == 1 is drawn in red, target == 0 in blue; the legend order follows the call order.
plt.scatter(x=df.age[df.target==1], y=df.thalach[df.target==1], c="red")
plt.scatter(x=df.age[df.target==0], y=df.thalach[df.target==0], c="blue")
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()
In [32]:
# Suppress the warning messages (shown in red in the notebook)
# NOTE(review): this filter is global, so every warning raised by later cells is hidden too.
import warnings
warnings.filterwarnings("ignore")
In [33]:
# Replace the abbreviated column names with full descriptive names
# NOTE: later cells reference these names verbatim (including the spellings
# 'max_hear_rate_achieced' and 'thalassemin'), so they must not be changed here alone.
df.columns = ['age', 'sex', 'chest_pain_type','resting_blood_pressure','cholesterol','fasting_blood_sugar','rest_ecg',
'max_hear_rate_achieced','exercise_induced_angina', 'st_depression',
'st_slope','num_major_vessels','thalassemin','target']
In [34]:
df.head()
Out[34]:
In [35]:
# Convert the integer codes of the nominal (categorical) features into
# descriptive strings, so the later one-hot encoding yields readable columns.
#
# Each column is rewritten as a whole with Series.replace instead of the
# chained-indexing form `df[col][mask] = value`, which pandas may apply to a
# temporary copy (SettingWithCopyWarning) rather than to `df` itself.
# Codes that are not listed in a mapping are left unchanged, and re-running
# the cell on already-converted columns is a no-op.
category_labels = {
    "sex": {0: "female", 1: "male"},
    "chest_pain_type": {
        0: "typical angina",
        1: "atypical angina",
        2: "non-anginal pain",
        3: "asymptomatic",
    },
    "fasting_blood_sugar": {
        0: "lower than 120mg/ml",
        1: "greater than 120mg/ml",
    },
    "rest_ecg": {
        0: "normal",
        1: "ST-T wave abnormality",
        2: "left ventricular hypertrophy",
    },
    # Bug fix: the labels were inverted. In the heart-disease data the
    # exercise-induced-angina flag is coded 1 = yes, 0 = no.
    "exercise_induced_angina": {0: "no", 1: "yes"},
    "st_slope": {0: "upsloping", 1: "flat", 2: "downsloping"},
    "thalassemin": {
        0: "unknown",
        1: "normal",
        2: "fixed defect",
        3: "reversable defect",
    },
}
for column, labels in category_labels.items():
    df[column] = df[column].replace(labels)
In [36]:
df.head()
Out[36]:
In [37]:
# One-hot encoding (object-typed columns are split into indicator columns)
df = pd.get_dummies(df)
In [38]:
df.columns
Out[38]:
In [39]:
df.head()
Out[39]:
In [40]:
# Export the processed dataset (without the index column)
df.to_csv("process_heart.csv", index=False)
pdpbox工具包可视化¶
In [41]:
from pdpbox import pdp, get_dataset, info_plots
In [43]:
# pdpbox target plot of `target` against the one-hot column `sex_male`, labelled 'gender'
# NOTE(review): the axes object is bound to `axed`, while the disabled line below refers to `axes`.
fig, axed, summary_df = info_plots.target_plot(
df=df,feature='sex_male', feature_name='gender', target=['target']
)
#_ = axes['bar_ax'].set_xticklabels({"Female", 'Male'})
In [44]:
fig, axed, summary_df = info_plots.target_plot(
df=df,feature='age', feature_name='age', target=['target']
)
In [45]:
# Relationship between a pair of features with respect to the target
feat_name1 = 'age'
nick_name1 = 'age'
feat_name2 = 'max_hear_rate_achieced'  # column name exactly as it is spelled in df
nick_name2 = 'max_heart_rate'  # display label only; fixed typo (was 'max_hart_rate')
fig, axed, summary_df = info_plots.target_plot_interact(
    df=df,features=[feat_name1, feat_name2], feature_names=[nick_name1,nick_name2], target=['target']
)
plt.show()
随机森林做分类¶
In [46]:
# Feature matrix: every column except target
X = df.drop('target', axis=1)
In [47]:
X.shape
Out[47]:
In [48]:
Y = df.target
In [49]:
Y
Out[49]:
In [50]:
# Split into train and test sets: 20% of the rows are held out for testing, with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
In [51]:
X.shape
Out[51]:
In [52]:
X_test.shape
Out[52]:
In [53]:
from sklearn.ensemble import RandomForestClassifier
In [54]:
# Random forest of 10 trees, each limited to depth 5, with a fixed seed; fitted on the training split
model = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=5)
model.fit(X_train, Y_train)
Out[54]:
In [55]:
# Select the decision tree at index 7 of the fitted forest
estimator = model.estimators_[7]
In [56]:
estimator
Out[56]:
In [57]:
# Feature names taken from the training frame
feature_names = X_train.columns
# String version of the training labels: '0' -> 'no disease', '1' -> 'disease'
Y_train_str = Y_train.astype('str')
Y_train_str[Y_train_str == '0'] = 'no disease'
Y_train_str[Y_train_str == '1'] = 'disease'
# Keep only the underlying array (one label per training row)
Y_train_str = Y_train_str.values
In [58]:
# Visualise the selected decision tree with Graphviz
from sklearn.tree import export_graphviz
import os
# Make the Graphviz `dot` executable discoverable (default Windows install path).
# Bug fix: the closing quote of this string literal was missing (syntax error).
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
export_graphviz(estimator, out_file='tree.dot',
                feature_names=feature_names,
                # Bug fix: class_names expects one name per class, in ascending
                # class order (0, 1), not one label per training row.
                class_names=['no disease', 'disease'],
                rounded=True, proportion=True,
                label='root',
                precision=2, filled=True)
# Render the .dot description to a PNG image
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
# Show the rendered image as the cell output
from IPython.display import Image
Image(filename='tree.png')
In [59]:
# Feature importances of the fitted forest
model.feature_importances_
Out[59]:
In [60]:
import numpy as np

# Print the features ordered from most to least important.
print("特征排序:")
feature_names = X_test.columns
feature_importances = model.feature_importances_
# argsort is ascending, so reverse it to put the largest importance first.
indices = np.argsort(feature_importances)[::-1]
for index in indices:
    print("feature %s (%f)"%(feature_names[index], feature_importances[index]))
In [61]:
# Weight of each feature
# NOTE(review): `estimator` is passed here rather than `model`; it appears to be a single tree
# taken from the forest in an earlier cell — confirm that is intended.
import eli5
eli5.show_weights(estimator, feature_names=feature_names.to_list())
Out[61]:
In [62]:
# Bar chart of the feature importances, in the order given by `indices`
plt.figure(figsize=(16,8))
plt.title("Feature Importance")
plt.bar(range(len(feature_importances)), feature_importances[indices], color='b')
# Label each bar with its feature name, rotated so that long names stay readable
plt.xticks(range(len(feature_importances)), np.array(feature_names)[indices], color='b', rotation=90)
plt.show()
预测分类结果¶
In [63]:
X_test.shape
Out[63]:
In [64]:
X_test.head()
Out[64]:
单个数据预测¶
In [65]:
test_sample = X_test.iloc[2]
In [66]:
test_sample.shape
Out[66]:
In [67]:
# Reshape the single sample into a 2-D array with one row
test_sample = np.array(test_sample).reshape(1,-1)
In [68]:
test_sample.shape
Out[68]:
In [69]:
# Qualitative binary-classification result (predicted class label)
model.predict(test_sample)
Out[69]:
In [70]:
# Quantitative binary-classification result (predicted class probabilities)
model.predict_proba(test_sample)
Out[70]:
全部数据预测¶
In [71]:
model.predict(X_test)
Out[71]:
In [72]:
model.predict_proba(X_test)
Out[72]:
In [73]:
# Confidence of disease: second column of the predicted probabilities
model.predict_proba(X_test)[:,1]
Out[73]:
In [74]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
混淆矩阵¶
In [75]:
from sklearn.metrics import confusion_matrix
In [76]:
confusion_matrix_model = confusion_matrix(Y_test, y_pred)
In [77]:
confusion_matrix_model
Out[77]:
In [78]:
# 混淆矩阵绘制模板
import itertools
def cnf_matrix_plotter(cm, classes):
    """Draw a confusion matrix as a coloured grid with the count in each cell.

    cm: 2-D confusion matrix, indexed as cm[row, col]; .max() and .shape are used.
    classes: tick labels, applied to both the x and the y axis.
    """
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Oranges)
    plt.title("Confusion Matrix")
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes)
    plt.yticks(positions, classes)

    # Cells whose count exceeds half of the maximum get white text, the rest black.
    cutoff = cm.max()/2
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_colour = 'white' if cm[row, col] > cutoff else "black"
            # x is the column (predicted) and y is the row (true).
            plt.text(col, row, cm[row, col],
                     horizontalalignment='center',
                     color=text_colour,
                     fontsize=25)

    plt.tight_layout()
    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.show()
In [79]:
cnf_matrix_plotter(confusion_matrix_model, ['Healthy', 'Disease'])
ROC曲线¶
In [80]:
y_pred_quant = model.predict_proba(X_test)[:,1]
In [81]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, threshold = roc_curve(Y_test, y_pred_quant)
In [82]:
fpr
Out[82]:
In [83]:
tpr
Out[83]:
In [84]:
# Thresholds returned by roc_curve
threshold
Out[84]:
In [85]:
# Plot the ROC curve
plt.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1) as a reference line
plt.plot([0,1], [0,1], ls='--', c=".3")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title("ROC curve")  # fixed typo: was "ROC curse"
plt.xlabel("False Positive Rate (1 - Specificity)")  # fixed typo: was "Positice"
plt.ylabel("True Positive Rate (Sensitivity)")
plt.grid(True)
# Explicit show, consistent with the other plotting cells
plt.show()
In [86]:
auc(fpr, tpr)
Out[86]:
绘制Permutation Importance图¶
In [87]:
import eli5
from eli5.sklearn import PermutationImportance
# Shuffle feature values to see how important each feature is
# NOTE(review): the permutation importance is fitted on the test split (X_test, Y_test).
perm = PermutationImportance(model, random_state=1).fit(X_test, Y_test)
eli5.show_weights(perm, feature_names=X_test.columns.tolist())
Out[87]:
In [88]:
fig, axes, summary_df = info_plots.actual_plot(
model=model, X=X_train, feature='sex_male', feature_name='gender',predict_kwds={}
)
In [89]:
# pdpbox actual-prediction plot for the `num_major_vessels` feature.
# Bug fix: the display name was copy-pasted as 'gender' from the previous cell.
fig, axes, summary_df = info_plots.actual_plot(
    model=model, X=X_train, feature='num_major_vessels', feature_name='num_vessels',predict_kwds={}
)
In [90]:
# Partial dependence of the model on `num_major_vessels`, computed on the test split
feat_name = 'num_major_vessels'
nick_name = 'num_vessels'  # display label used in the plot
# presumably feature_names holds the model's feature columns at this point — verify against the earlier cells
pdp_dist = pdp.pdp_isolate(
model=model, dataset=X_test, model_features=feature_names, feature=feat_name
)
fig, axes = pdp.pdp_plot(pdp_dist, nick_name, plot_lines=True, frac_to_plot=0.8, plot_pts_dist=True)
In [91]:
pdp_dist = pdp.pdp_isolate(
model=model, dataset=X_test, model_features=feature_names, feature="max_hear_rate_achieced"
)
fig, axes = pdp.pdp_plot(pdp_dist, 'max_heart_rate')
In [92]:
# Partial-dependence plot for every feature, one figure per feature.
# Bug fix: the loop computed pdp_isolate for each feature but never drew it,
# so the trailing plt.show() had nothing to display.
for each in feature_names:
    feat_name = each
    pdp_dist = pdp.pdp_isolate(
        model=model, dataset=X_test, model_features=feature_names, feature=feat_name
    )
    # Same two-argument call form as the single-feature PDP cells in this notebook.
    fig, axes = pdp.pdp_plot(pdp_dist, feat_name)
    plt.show()
二维PDP图:特征之间的交互关系分析¶
In [93]:
# Two-feature partial dependence: maximum heart rate against number of major vessels
feat_name1 = "max_hear_rate_achieced"  # column name exactly as it is spelled in the data
nick_name1 = "max_heart_rate"  # display label only; fixed typo (was "max_hear_rate")
feat_name2 = "num_major_vessels"
nick_name2 = "num_vessels"
inter1 = pdp.pdp_interact(
    model=model, dataset=X_test, model_features=feature_names, features=[feat_name1, feat_name2]
)
fig, axes = pdp.pdp_interact_plot(
    pdp_interact_out=inter1, feature_names=[nick_name1, nick_name2], plot_type="contour", x_quantile=True, plot_pdp=True)
shap机器学习可解释性分析包¶
In [95]:
import shap
shap.initjs()
In [113]:
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
In [114]:
len(shap_values)
Out[114]:
In [115]:
shap_values[0].shape
Out[115]:
In [117]:
shap_values[1].shape
Out[117]:
In [119]:
# Average predicted probability for the disease / no-disease classes (the explainer's expected values)
explainer.expected_value
Out[119]:
In [120]:
shap.summary_plot(shap_values[1], X_test, plot_type='bar')
In [122]:
# Each row is one feature; red marks data points where that feature's value is high;
# the further right a point lies, the stronger its positive contribution towards disease.
shap.summary_plot(shap_values[1], X_test)
In [126]:
shap.summary_plot(shap_values[1], X_test, plot_type='violin')
In [128]:
# For a single patient
# NOTE(review): idx selects a row of the full feature frame X, whereas shap_values above were
# computed on X_test — confirm which frame this index is meant to address.
idx = 126
patient = X.iloc[idx,:]
In [136]:
patient
Out[136]: